In [1]:
import pandas as pd
# Load the blame log from its CSV export (path is relative to the notebook's
# working directory) and preview the first rows. Later cells read the
# `author`, `timestamp`, `path` and `line` columns of this frame.
log = pd.read_csv("../dataset/linux_blame_log.csv")
log.head()
Out[1]:
In [2]:
# Summarize the loaded frame: column dtypes, non-null counts, memory usage.
log.info()
In [3]:
# The ten authors with the most rows in the log; value_counts() already
# orders them from most to least frequent, so the first ten are the top ten.
top10 = log['author'].value_counts().iloc[:10]
top10
Out[3]:
In [4]:
%matplotlib inline
top10.plot.pie(figsize=[5,5],
title="Top 10 Wissensträger",
label="");
In [5]:
# Parse the timestamp column into pandas datetimes and preview the result.
log['timestamp'] = pd.to_datetime(log['timestamp'])
log.head()
Out[5]:
In [6]:
# Age of every row: time elapsed between the moment this cell runs and the
# row's timestamp (so the values change on each execution).
log['age'] = pd.Timestamp('today') - log['timestamp']
log.head()
Out[6]:
In [7]:
# Derive a component label from each path: keep the first two "/"-separated
# segments and join them with ":".
log['component'] = (
    log['path']
    .str.split("/")
    .str.slice(0, 2)
    .str.join(":")
)
log.head()
Out[7]:
In [8]:
# Smallest age (i.e. the most recent timestamp) per component, sorted
# ascending so the most recently touched components come first.
age_per_component = (
    log.groupby(['component'])['age']
    .min()
    .sort_values()
)
age_per_component.head()
Out[8]:
In [9]:
# The series is sorted ascending by minimum age, so the last ten entries are
# the components whose most recent timestamp lies furthest in the past.
age_per_component.tail(10)
Out[9]:
In [10]:
# Bar chart of the minimum age per component; the semicolon hides the text repr.
age_per_component.plot(kind="bar", figsize=(15, 5));
In [11]:
# Boolean mask: True for rows whose timestamp is newer than six calendar
# months before the moment this cell runs.
knowledge = (
    log['timestamp'] > pd.Timestamp('today') - pd.DateOffset(months=6)
)
knowledge.value_counts()
Out[11]:
In [12]:
# Mean of the boolean mask = fraction of rows with a timestamp inside the
# six-month window.
knowledge.mean()
Out[12]:
In [13]:
# Pie chart of rows inside vs. outside the six-month window.
knowledge.value_counts().plot(kind="pie");
In [14]:
# Re-index the log by its timestamp column and preview the result.
# NOTE(review): the following cell rebinds `log_timed` from `log` directly
# (grouping on the `timestamp` column via pd.Grouper(key=...)), so this
# indexed frame is only used for the preview below.
log_timed = log.set_index('timestamp')
log_timed.head()
Out[14]:
In [15]:
# Count the non-null `line` entries per month bucket and component; the
# result is a Series with a (timestamp, component) MultiIndex.
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
# 'ME' — confirm against the pandas version this notebook is run with.
log_timed = (
    log
    .groupby([pd.Grouper(key='timestamp', freq='M'), 'component'])['line']
    .count()
)
log_timed.head()
Out[15]:
In [16]:
# Pivot the innermost index level (component) into columns, giving one row
# per time bucket; combinations with no count become 0.
component_history = (
    log_timed
    .unstack()
    .fillna(0)
)
component_history.head()
Out[16]:
In [17]:
# Normalize every row (one time bucket) so that the component values in it
# sum to 1. The row totals are computed a single time and divided in along
# the index (axis=0), instead of being recomputed once per column inside an
# apply/lambda.
relative_history = component_history.div(
    component_history.sum(axis=1), axis=0)
relative_history.head()
Out[17]:
In [18]:
# Stacked area chart of each component's relative share over time; the
# legend is switched off.
relative_history.plot(kind="area", stacked=True, legend=False, figsize=(15, 8))
Out[18]: